t-SNE is a dimensionality reduction technique that maps high-dimensional data to a lower-dimensional space while preserving local and global structure.
Note: Better and faster if PCA is first applied on data set and then run t-SNE on the 50 PC’s (some packages does this on their own, however the Seurat t-SNE does not).
t-SNE uses pairwise similarities between data points (usually measured by Euclidean distance) to construct a similarity matrix and then use conditional probability of a point being neighbor to another point under a student’s t-distribution to rearrange the data points in a low-dimensional space. This makes t-SNE adapt to handling non-linear data structures.
Note: To preserve clusters/partitions in the high-dimensional space, we want to scale the similarity values to have standard deviation equal to 1, so that density between a group of points will be seen as the same, even though a group may be less dense.
t-SNE and UMAP are quite similar methods.
setwd("~/sc_covid_PiB2023/data_science_project/Data")
data <- read_h5ad("Pilot_2_rule_them_ALL.h5ad") # our pilot data
In this section we run t-SNE on the data on which PCA has been used to dimension reduced.
We start by running it on the different cell types using ‘RunTsne’ from the ‘Seurat’ package. Running t-SNE on a cell type level allows for the comparison of infected and uninfected cells within each cell type.
# Making a dataframe (used for plotting w. ggplots) with t-SNE coordinates
celltypes <- unique(data$obs$cellType)
df <- as.data.frame(data$obs) %>%
add_column(tSNE_1 = 0) %>%
add_column(tSNE_2 = 0)
for (c in celltypes){
#print(c) # To show progress of loop
c_subset = data[which(data$obs$cellType == c)] # Subsetting for each celltype
PCA <- RunPCA(c_subset$X, assay = NULL, npcs = 50, rev.pca = F, weight.by.var = T, verbose = F)
c_cells_tsne <- RunTSNE(
PCA[],
assay = "RNA",
seed.use = 1,
tsne.method = "Rtsne",
dim.embed = 2,
max_iter = 500)
df[df$cellType == c,]$tSNE_1 <- c_cells_tsne[[,1]]
df[df$cellType == c,]$tSNE_2 <- c_cells_tsne[[,2]]
}
Plot colored by infection status.
df %>%
arrange(viralLoad) %>%
ggplot(aes(x = tSNE_1, y = tSNE_2, color = Is_infected)) +
geom_point(size = 1.5, alpha = 0.6) +
facet_wrap("cellType", scales = "free", labeller = labeller()) +
scale_color_manual(values = c("0" = "blue", "1" = "red")) +
theme(axis.text = element_blank(),
axis.ticks = element_blank()) +
labs(title = "t-SNE cell type") +
xlab("tSNE_1") +
ylab("tSNE_2")
Plot coloredd by the Patient ID
df %>%
arrange(viralLoad) %>%
ggplot(aes(x = tSNE_1, y = tSNE_2, color = PatientID)) +
geom_point(size = 1.5, alpha = 0.6) +
facet_wrap("cellType", scales = "free", labeller = labeller()) +
theme(axis.text = element_blank(),
axis.ticks = element_blank()) +
labs(title = "t-SNE cell type") +
xlab("tSNE_1") +
ylab("tSNE_2")
We try clustering for infection status on cell level.
celltypes <- unique(data$obs$cellType)
matrix_of_ari <- matrix(1:48, nrow = 8, ncol = 6,
dimnames = list(celltypes,
c("PCA_kmeans", "PCA_hier", "scVI_kmeans", "scVI_hier", "cor_kmeans", "cor_hier")))
n_clusters = 2 # number of clusters
df["kmeans"] <- 0
df["hierarchical"] <- 0
for (c in celltypes){
c_subset = df[df$cellType == c,]
max_ari = -1
# Kmeans:
cluster <- kmeans(c_subset[, 9:10], iter = 10000, nstart = 100, n_clusters) # kmeans clustering
matrix_of_ari[c, "PCA_kmeans"] = adj.rand.index(cluster$cluster, c_subset$Is_infected) # ARI for kmeans clustering
df[df$cellType == c,]$kmeans <- factor(cluster$cluster)
# Hierarchical
for (m in c("complete", "single", "average", "centroid")){
dm <- dist(as.data.frame(c_subset[, 9:10])) # Distance matrix
hc <- hclust(dm, method = "centroid") # simple dendrogram
clusterCutS <- cutree(hc, n_clusters) # data, number of clusters
ari = adj.rand.index(clusterCutS, c_subset$Is_infected)
df[df$cellType == c,]$hierarchical <- factor(clusterCutS)
if (max_ari < ari) {
matrix_of_ari[c, "PCA_hier"] <- ari
max_ari = ari}
}}
matrix_of_ari <- round(matrix_of_ari, 4)
matrix_of_ari[,1:2]
## PCA_kmeans PCA_hier
## Macrophage 0.0443 0.0005
## Plasma 0.0677 0.0042
## Secretory 0.3723 0.0002
## Neutrophil 0.0019 0.0000
## NK -0.0028 0.0001
## T 0.0956 0.0169
## Ciliated 0.1021 -0.0001
## Squamous 0.0417 -0.0004
ggplot(df, aes(x = tSNE_1, y = tSNE_2 , color = factor(kmeans))) +
geom_point(size = 1.5, alpha = 0.6) +
facet_wrap("cellType", scales = "free", labeller = labeller()) +
theme(axis.text = element_blank(),
axis.ticks = element_blank()) +
labs(title = "Kmeans on tSNE", subtitle = "Dimension reduction with PCA") +
xlab("tSNE_1") +
ylab("tSNE_2") +
theme(legend.position = "none")
ggplot(df, aes(x = tSNE_1, y = tSNE_2 , color = factor(hierarchical))) +
geom_point(size = 1.5, alpha = 0.6) +
facet_wrap("cellType", scales = "free", labeller = labeller()) +
theme(axis.text = element_blank(),
axis.ticks = element_blank()) +
labs(title = "Hierarchical on tSNE", subtitle = "Dimension reduction with PCA\nClustered by infection; k = 2") +
xlab("tSNE_1") +
ylab("tSNE_2") +
theme(legend.position = "none")
This loop-based approach utilizes the adjusted Rand index (ARI) as a measure to guide the search for the best parameter values. By systematically iterating through different combinations of parameters and evaluating their performance using the ARI, this approach helps identify the parameter values that yield the highest similarity between predicted and true labels. It saves the coordinates in the dataset to be used for clustering plots in ‘Clean_Clustering’.
perplexity_list <- c(5, 30, 50, 100, 150)
theta_list <- c(0.15, 0.25, 0.5, 0.8, 0.99)
n_clusters = 8 # number of clusters
max_ARI_kmeans_tsne = 0
max_ARI_hier_tsne = 0
for (p in perplexity_list) {
for (t in theta_list) {
tsne_test <- RunTSNE(
data$obsm$X_pca,
assay = "RNA",
seed.use = 10,
tsne.method = "Rtsne",
dim.embed = 2,
perplexity = p,
theta = t
)
cluster <- kmeans(tsne_test[[, 1:2]], iter = 10000, nstart = 100, n_clusters) # kmeans clustering
ARI_tsne = adj.rand.index(cluster$cluster, data$obs$cellType) # ARI for kmeans clustering
if (max_ARI_kmeans_tsne < ARI_tsne) {
max_ARI_kmeans_tsne = ARI_tsne
best_kmeans_tsne <- tsne_test}
for (m in c("complete", "single", "average", "centroid")){
dm <- dist(as.data.frame(tsne_test[[, 1:2]])) # Distance matrix
hc <- hclust(dm, method = m) # simple dendrogram
clusterCutS <- cutree(hc, n_clusters) # data, number of clusters
ARI_hier_tsne = adj.rand.index(clusterCutS, data$obs$cellType)
if (max_ARI_hier_tsne < ARI_hier_tsne) {
max_ARI_hier_tsne = ARI_hier_tsne
best_hier_tsne <- tsne_test}
}
}
}
# We save the coordinates that gives the highest ARI in the pilot set to plot the clusterings later.
if (max_ARI_kmeans_tsne > max_ARI_hier_tsne) {
matrix_data <-
matrix(
data = c(best_kmeans_tsne[[, 1]], best_kmeans_tsne[[, 2]]),
nrow = length(best_kmeans_tsne[[, 1]]),
ncol = 2
)
data$obsm$X_PCA_tSNE <- matrix_data
write_h5ad(data, '../Data/Pilot_2_rule_them_ALL.h5ad')
} else {
matrix_data <-
matrix(
data = c(best_hier_tsne[[, 1]], best_hier_tsne[[, 2]]),
nrow = length(best_hier_tsne[[, 1]]),
ncol = 2
)
data$obsm$X_PCA_tSNE <- matrix_data
write_h5ad(data, '../Data/Pilot_2_rule_them_ALL.h5ad')
}
In this section we run t-SNE on the data on which scVI has been used to dimension reduced.
We start by running it on the different cell types using ‘RunTsne’ from the ‘Seurat’ package. Running t-SNE on a cell type level allows for the comparison of infected and uninfected cells within each cell type.
celltypes <- unique(data$obs$cellType)
df <- as.data.frame(data$obs) %>%
add_column(tSNE_1 = 0) %>%
add_column(tSNE_2 = 0)
for (c in celltypes){
print(c) # To show progress of loop
c_subset = data[which(data$obs$cellType == c)]
c_cells_tsne <- RunTSNE(
c_subset$obsm$X_scVI,
assay = "RNA",
seed.use = 1,
tsne.method = "Rtsne",
dim.embed = 2,
max_iter = 500)
df[df$cellType == c,]$tSNE_1 <- c_cells_tsne[[,1]]
df[df$cellType == c,]$tSNE_2 <- c_cells_tsne[[,2]]
}
## [1] "Macrophage"
## [1] "Plasma"
## [1] "Secretory"
## [1] "Neutrophil"
## [1] "NK"
## [1] "T"
## [1] "Ciliated"
## [1] "Squamous"
Plot colored by the infection status of each cell.
df %>%
arrange(viralLoad) %>%
ggplot(aes(x = tSNE_1, y = tSNE_2, color = Is_infected)) +
geom_point(size = 1.5, alpha = 0.6) +
facet_wrap("cellType", scales = "free", labeller = labeller()) +
scale_color_manual(values = c("0" = "blue", "1" = "red")) +
theme(axis.text = element_blank(),
axis.ticks = element_blank()) +
labs(title = "t-SNE cell type") +
xlab("tSNE_1") +
ylab("tSNE_2")
Plot colored by patient id.
df %>%
arrange(viralLoad) %>%
ggplot(aes(x = tSNE_1, y = tSNE_2, color = PatientID)) +
geom_point(size = 1.5, alpha = 0.6) +
facet_wrap("cellType", scales = "free", labeller = labeller()) +
theme(axis.text = element_blank(),
axis.ticks = element_blank()) +
labs(title = "t-SNE cell type") +
xlab("tSNE_1") +
ylab("tSNE_2")
We try clustering for infection status on cell level.
celltypes <- unique(data$obs$cellType)
n_clusters = 2 # number of clusters
df["kmeans"] <- 0
df["hierarchical"] <- 0
for (c in celltypes){
c_subset = df[df$cellType == c,]
max_ari = -1
# Kmeans:
cluster <- kmeans(c_subset[, 9:10], iter = 10000, nstart = 100, n_clusters) # kmeans clustering
matrix_of_ari[c, "scVI_kmeans"] = adj.rand.index(cluster$cluster, c_subset$Is_infected) # ARI for kmeans clustering
df[df$cellType == c,]$kmeans <- factor(cluster$cluster)
# Hierarchical
for (m in c("complete", "single", "average", "centroid")){
dm <- dist(as.data.frame(c_subset[, 9:10])) # Distance matrix
hc <- hclust(dm, method = "centroid") # simple dendrogram
clusterCutS <- cutree(hc, n_clusters) # data, number of clusters
ari = adj.rand.index(clusterCutS, c_subset$Is_infected)
df[df$cellType == c,]$hierarchical <- factor(clusterCutS)
if (max_ari < ari) {
matrix_of_ari[c, "scVI_hier"] <- ari
max_ari = ari}
}}
matrix_of_ari <- round(matrix_of_ari, 4)
matrix_of_ari[,3:4]
## scVI_kmeans scVI_hier
## Macrophage 0.1187 0.0236
## Plasma 0.7703 0.8054
## Secretory 0.5134 0.0362
## Neutrophil -0.0009 0.1465
## NK 0.6918 0.7022
## T 0.0103 0.0972
## Ciliated 0.0336 0.0336
## Squamous -0.0014 -0.0014
ggplot(df, aes(x = tSNE_1, y = tSNE_2 , color = factor(kmeans))) +
geom_point(size = 1.5, alpha = 0.6) +
facet_wrap("cellType", scales = "free", labeller = labeller()) +
theme(axis.text = element_blank(),
axis.ticks = element_blank()) +
labs(title = "Kmeans on scVI", subtitle = "Clustered by infection; k = 2") +
xlab("tSNE_1") +
ylab("tSNE_2")
ggplot(df, aes(x = tSNE_1, y = tSNE_2 , color = factor(hierarchical))) +
geom_point(size = 1.5, alpha = 0.6) +
facet_wrap("cellType", scales = "free", labeller = labeller()) +
theme(axis.text = element_blank(),
axis.ticks = element_blank()) +
labs(title = "Hierarchical on scVI", subtitle = "Clustered by infection; k = 2") +
xlab("tSNE_1") +
ylab("tSNE_2")+
theme(legend.position = "none")
This loop-based approach utilizes the adjusted Rand index (ARI) as a measure to guide the search for the best parameter values. By systematically iterating through different combinations of parameters and evaluating their performance using the ARI, this approach helps identify the parameter values that yield the highest similarity between predicted and true labels. It saves the coordinates in the dataset to be used for clustering plots in ‘Clean_Clustering’.
perplexity_list <- c(5, 30, 50, 100, 150)
theta_list <- c(0.15, 0.25, 0.5, 0.8, 0.99)
n_clusters = 8 # number of clusters
max_ARI_kmeans_tsne = 0
max_ARI_hier_tsne = 0
for (p in perplexity_list) {
for (t in theta_list) {
tsne_test <- RunTSNE(
data$obsm$X_scVI,
assay = "RNA",
seed.use = 10,
tsne.method = "Rtsne",
dim.embed = 2,
perplexity = p,
theta = t
)
cluster <- kmeans(tsne_test[[, 1:2]], iter = 10000, nstart = 100, n_clusters) # kmeans clustering
ARI_tsne = adj.rand.index(cluster$cluster, data$obs$cellType) # ARI for kmeans clustering
if (max_ARI_kmeans_tsne < ARI_tsne) {
max_ARI_kmeans_tsne = ARI_tsne
tsne_kmean_per = p
tsne_kmean_theta = t
best_kmeans_tsne <- tsne_test}
for (m in c("complete", "single", "average", "centroid")){
dm <- dist(as.data.frame(tsne_test[[, 1:2]])) # Distance matrix
hc <- hclust(dm, method = m) # simple dendrogram
clusterCutS <- cutree(hc, n_clusters) # data, number of clusters
ARI_hier_tsne = adj.rand.index(clusterCutS, data$obs$cellType)
if (max_ARI_hier_tsne < ARI_hier_tsne) {
best_linkage_tsne = m
max_ARI_hier_tsne = ARI_hier_tsne
hier_tsne_per = p
hier_tsne_theta = t
best_hc = hc
best_hier_tsne <- tsne_test}
}
}
}
# We save the coordinates that gives the highest ARI in the pilot set to plot the clusterings later.
if (max_ARI_kmeans_tsne > max_ARI_hier_tsne) {
matrix_data <-
matrix(
data = c(best_kmeans_tsne[[, 1]], best_kmeans_tsne[[, 2]]),
nrow = length(best_kmeans_tsne[[, 1]]),
ncol = 2
)
data$obsm$X_scVI_tSNE <- matrix_data
write_h5ad(data, '../Data/Pilot_2_rule_them_ALL.h5ad')
} else {
matrix_data <-
matrix(
data = c(best_hier_tsne[[, 1]], best_hier_tsne[[, 2]]),
nrow = length(best_hier_tsne[[, 1]]),
ncol = 2
)
data$obsm$X_scVI_tSNE <- matrix_data
write_h5ad(data, '../Data/Pilot_2_rule_them_ALL.h5ad')
}
This loop explores various parameter combinations to find the optimal configuration for hierarchical clustering, based on the highest Adjusted Rand Index (ARI) score. The ARI scores are stored in a dataframe for creating a heatmap which offers a visual representation of the explored parameter space.
df_tsne_ari <- data.frame(matrix(nrow = 5, ncol = 5))
perplexity_list <- c(5, 30, 50, 100, 150)
theta_list <- c(0.15, 0.25, 0.5, 0.8, 0.99)
colnames(df_tsne_ari) <- perplexity_list
rownames(df_tsne_ari) <- theta_list
n = 8 # number of clusters
for (i in 1:nrow(df_tsne_ari)) {
for (j in 1:ncol(df_tsne_ari)) {
tsne_test <- RunTSNE(
data$obsm$X_scVI,
assay = "RNA",
seed.use = 10,
tsne.method = "Rtsne",
dim.embed = 2,
perplexity = perplexity_list[j],
theta = theta_list[i]
)
dm <- dist(as.data.frame(tsne_test[[, 1:2]])) # Distance matrix
hc <- hclust(dm, method = "centroid") # simple dendrogram
clusterCutS <- cutree(hc, n) # data, number of clusters
ARI <- adj.rand.index(clusterCutS, data$obs$cellType)
df_tsne_ari[i, j] <- round(ARI,4)
}
}
The heatmap displays the ARi value of differnet combinations of parameters. The best score is marked with a blue border.
ari_matrix <- as.matrix(df_tsne_ari)
# Find the maximum value and its coordinates
max_coords <- as.vector(which(ari_matrix == max(ari_matrix), arr.ind = TRUE))
# Function to mark the max value
makeRects <- function(cells){
xl=cells[2]-0.49
yb=nrow(ari_matrix)+1-cells[1]-0.49
xr=cells[2]+0.49
yt=nrow(ari_matrix)+1-cells[1]+0.49
rect(xl,yb,xr,yt,border="blue",lwd=3)
}
heatmap.2(
ari_matrix,
Colv = NA,
Rowv = NA,
dendrogram = "none",
trace = "none",
xlab = "min.dist",
ylab = "n.neigbours",
main = "Heatmap of ARI",
#col = colorRampPalette(c("white", "blue")),
notecol = "black",
key = TRUE,
cellnote = df_tsne_ari,
add.expr = {makeRects(max_coords)})
In this section we run t-SNE on the data on which scVI with batch corrections has been used to dimension reduced. The idea is that by applying batch correction with scVI, it becomes easier to identify and characterize cell types more accurately, enhancing downstream analyses such as clustering. By correcting we might also enhance the ability to discern true biological differences from technical impressions.
We start by running it on the different cell types using ‘RunTsne’ from the ‘Seurat’ package. Running t-SNE on a cell type level allows for the comparison of infected and uninfected cells within each cell type.
celltypes <- unique(data$obs$cellType)
df <- as.data.frame(data$obs) %>%
add_column(tSNE_1 = 0) %>%
add_column(tSNE_2 = 0)
for (c in celltypes){
c_subset = data[which(data$obs$cellType == c)]
c_cells_tsne <- RunTSNE(
c_subset$obsm$X_scVI_corrected,
assay = "RNA",
seed.use = 1,
tsne.method = "Rtsne",
dim.embed = 2,
max_iter = 500)
df[df$cellType == c,]$tSNE_1 <- c_cells_tsne[[,1]]
df[df$cellType == c,]$tSNE_2 <- c_cells_tsne[[,2]]
}
Plots colored by the infection status of each cell.
df %>%
arrange(viralLoad) %>%
ggplot(aes(x = tSNE_1, y = tSNE_2, color = Is_infected)) +
geom_point(size = 1.5, alpha = 0.6) +
facet_wrap("cellType", scales = "free", labeller = labeller()) +
scale_color_manual(values = c("0" = "blue", "1" = "red")) +
theme(axis.text = element_blank(),
axis.ticks = element_blank()) +
labs(title = "t-SNE cell type") +
xlab("tSNE_1") +
ylab("tSNE_2")
Plots colored by patient id.
df %>%
arrange(viralLoad) %>%
ggplot(aes(x = tSNE_1, y = tSNE_2, color = PatientID)) +
geom_point(size = 1.5, alpha = 0.6) +
facet_wrap("cellType", scales = "free", labeller = labeller()) +
theme(axis.text = element_blank(),
axis.ticks = element_blank()) +
labs(title = "t-SNE cell type") +
xlab("tSNE_1") +
ylab("tSNE_2")
celltypes <- unique(data$obs$cellType)
n_clusters = 2 # number of clusters
df["kmeans"] <- 0
df["hierarchical"] <- 0
for (c in celltypes){
c_subset = df[df$cellType == c,]
max_ari =-1
# Kmeans:
cluster <- kmeans(c_subset[, 9:10], iter = 10000, nstart = 100, n_clusters) # kmeans clustering
matrix_of_ari[c, "cor_kmeans"] = adj.rand.index(cluster$cluster, c_subset$Is_infected) # ARI for kmeans clustering
df[df$cellType == c,]$kmeans <- factor(cluster$cluster)
# Hierarchical
for (m in c("complete", "single", "average", "centroid")){
dm <- dist(as.data.frame(c_subset[, 9:10])) # Distance matrix
hc <- hclust(dm, method = "centroid") # simple dendrogram
clusterCutS <- cutree(hc, n_clusters) # data, number of clusters
ari = adj.rand.index(clusterCutS, c_subset$Is_infected)
df[df$cellType == c,]$hierarchical <- factor(clusterCutS)
if (max_ari < ari) {
matrix_of_ari[c, "cor_hier"] <- ari
max_ari = ari}
}}
matrix_of_ari <- round(matrix_of_ari, 4)
matrix_of_ari[,5:6]
## cor_kmeans cor_hier
## Macrophage 0.0034 0.0000
## Plasma 0.8125 0.8268
## Secretory 0.0108 0.0149
## Neutrophil 0.0409 0.0000
## NK 0.0094 0.0831
## T 0.2201 0.0213
## Ciliated 0.1025 0.0671
## Squamous -0.0036 0.1653
ggplot(df, aes(x = tSNE_1, y = tSNE_2 , color = factor(kmeans))) +
geom_point(size = 1.5, alpha = 0.6) +
facet_wrap("cellType", scales = "free", labeller = labeller()) +
theme(axis.text = element_blank(),
axis.ticks = element_blank()) +
labs(title = "Kmeans on scVI", subtitle = "Clustered by infection; k = 2") +
xlab("tSNE_1") +
ylab("tSNE_2")
ggplot(df, aes(x = tSNE_1, y = tSNE_2 , color = factor(hierarchical))) +
geom_point(size = 1.5, alpha = 0.6) +
facet_wrap("cellType", scales = "free", labeller = labeller()) +
theme(axis.text = element_blank(),
axis.ticks = element_blank()) +
labs(title = "Hierarchical on scVI", subtitle = "Clustered by infection; k = 2") +
xlab("tSNE_1") +
ylab("tSNE_2")+
theme(legend.position = "none")
A heatmap displaying the ARI score of each combination of dimention reduction methods and clustering method with the different celltypes.
heatmap.2(
matrix_of_ari,
Colv = NA,
Rowv = NA,
dendrogram = "none",
cexRow=0.8,
cexCol = 0.8,
trace = "none",
main = "ARI with different methods",
notecol = "black",
key = TRUE,
cellnote = matrix_of_ari)
This loop-based approach utilizes the adjusted Rand index (ARI) as a measure to guide the search for the best parameter values. By systematically iterating through different combinations of parameters and evaluating their performance using the ARI, this approach helps identify the parameter values that yield the highest similarity between predicted and true labels. It saves the coordinates in the dataset to be used for clustering plots in ‘Clean_Clustering’.
perplexity_list <- c(5, 30, 50, 100, 150)
theta_list <- c(0.15, 0.25, 0.5, 0.8, 0.99)
n_clusters = 8 # number of clusters
max_ARI_kmeans_tsne = 0
max_ARI_hier_tsne = 0
for (p in perplexity_list) {
for (t in theta_list) {
tsne_test <- RunTSNE(
data$obsm$X_scVI_corrected,
assay = "RNA",
seed.use = 10,
tsne.method = "Rtsne",
dim.embed = 2,
perplexity = p,
theta = t
)
cluster <- kmeans(tsne_test[[, 1:2]], iter = 10000, nstart = 100, n_clusters) # kmeans clustering
ARI_tsne = adj.rand.index(cluster$cluster, data$obs$cellType) # ARI for kmeans clustering
if (max_ARI_kmeans_tsne < ARI_tsne) {
max_ARI_kmeans_tsne = ARI_tsne
best_kmeans_tsne <- tsne_test}
for (m in c("complete", "single", "average", "centroid")){
dm <- dist(as.data.frame(tsne_test[[, 1:2]])) # Distance matrix
hc <- hclust(dm, method = m) # simple dendrogram
clusterCutS <- cutree(hc, n_clusters) # data, number of clusters
ARI_hier_tsne = adj.rand.index(clusterCutS, data$obs$cellType)
if (max_ARI_hier_tsne < ARI_hier_tsne) {
max_ARI_hier_tsne = ARI_hier_tsne
best_hier_tsne <- tsne_test}
}
}
}
# We save the coordinates that gives the highest ARI in the pilot set to plot the clusterings later.
if (max_ARI_kmeans_tsne > max_ARI_hier_tsne) {
matrix_data <-
matrix(
data = c(best_kmeans_tsne[[, 1]], best_kmeans_tsne[[, 2]]),
nrow = length(best_kmeans_tsne[[, 1]]),
ncol = 2
)
data$obsm$X_scVI_corrected_tSNE <- matrix_data
write_h5ad(data, '../Data/Pilot_2_rule_them_ALL.h5ad')
} else {
matrix_data <-
matrix(
data = c(best_hier_tsne[[, 1]], best_hier_tsne[[, 2]]),
nrow = length(best_hier_tsne[[, 1]]),
ncol = 2
)
data$obsm$X_scVI_corrected_tSNE <- matrix_data
write_h5ad(data, '../Data/Pilot_2_rule_them_ALL.h5ad')
}